library(haven)
library(caret)
library(gridExtra)
library(corrplot)
library(e1071)
library(car)
library(lattice)
library(doParallel)
library(RANN)
library(rpart)
library(party)
library(partykit)
library(rpart.plot)
library(randomForest)
library(RWeka)
library(gbm)
library(Cubist)
library(tidyverse)
# Parallel backend: 6 workers for caret's resampling/tuning loops.
cl <- makeCluster(6)
registerDoParallel(cl)
# NHANES transport (.XPT) files: demographics, body measures, triglyceride/LDL panel.
Demographic <- read_xpt("P_DEMO.XPT")
BodySize <- read_xpt("P_BMX.XPT")
Chol_ldl <- read_xpt("P_TRIGLY.XPT")
#Select Variables of interest: respondent ID plus the modeling target (LDL)
Chol_ldl <- Chol_ldl %>% select(SEQN, LBDLDL)
#NA in target feature won't be useful
Chol_ldl <- Chol_ldl %>% drop_na()
#Get rid of variables we don't need (administrative/weighting columns)
Drop_col <- c('SDDSRVYR', 'RIDSTATR', 'RIDEXMON', 'SIAPROXY', 'SIAINTRP', 'FIAPROXY', 'FIAINTRP', 'MIAPROXY', 'MIAINTRP', 'WTINTPRP', 'WTMECPRP', 'SDMVPSU', 'SDMVSTRA')
# any_of() is the maintained replacement for the superseded one_of()
Demographic <- Demographic %>% select(-any_of(Drop_col))
Drop_col <- c('BMIWT', 'BMIRECUM', 'BMIHEAD', 'BMIHT', 'BMILEG', 'BMIARML', 'BMIARMC', 'BMIWAIST', 'BMIHIP', 'BMDSTATS')
BodySize <- BodySize %>% select(-any_of(Drop_col))
# Left joins keep exactly the rows that have an LDL measurement.
J1 <- Chol_ldl %>% left_join(Demographic, by = "SEQN")
Chol <- J1 %>% left_join(BodySize, by = "SEQN")
# Respondent ID carries no predictive information.
Chol <- Chol %>% select(!SEQN)
Chol_2 <- Chol
# Categorical NHANES columns (coded as small integers in the raw data) to factorize.
factors <- c("RIAGENDR", "RIDRETH1", "RIDRETH3", "DMDBORN4", "DMDEDUC2", "DMDMARTZ", "RIDEXPRG", "SIALANG", "FIALANG", "MIALANG", "AIALANGA")
Chol_2[,factors] <- lapply(Chol_2[,factors], factor)
# Replace numeric level codes with short readable labels.
# NOTE(review): each assignment assumes the column has exactly this many
# observed levels, in this order — confirm against the NHANES codebook.
levels(Chol_2$RIAGENDR) <- c("Male", "Female")
levels(Chol_2$RIDRETH1) <- c("Mex", "OHis", "White", "Black", "Oth")
levels(Chol_2$RIDRETH3) <- c("Mex", "OHis", "White", "Black", "Asian", "Oth")
levels(Chol_2$DMDBORN4) <- c("USA", "Oth", "Ref", "DK")
# NOTE(review): DMDYRUSZ is not in `factors` above, so it is still numeric and
# this line only attaches a "levels" attribute without creating a factor.
# The column is dropped later, so downstream results are unaffected — but
# either add "DMDYRUSZ" to `factors` or delete this line.
levels(Chol_2$DMDYRUSZ) <- c("<5", "5-15", "15-30", ">30", "Ref", "DK")
levels(Chol_2$DMDEDUC2) <- c("<9", "9-11", "HS", "AA", "BS+", "Ref", "DK")
levels(Chol_2$DMDMARTZ) <- c("Mar", "Sep", "Nev", "Ref", "DK")
levels(Chol_2$RIDEXPRG) <- c("Yes", "No", "DK")
levels(Chol_2$SIALANG) <- c("English", "Spanish")
levels(Chol_2$FIALANG) <- c("English", "Spanish")
levels(Chol_2$MIALANG) <- c("English", "Spanish")
levels(Chol_2$AIALANGA) <- c("English", "Spanish", "Asian")
# Per-variable missing-value counts, sorted most-missing first.
# summarise(across(...)) replaces the defunct summarise_all(funs(...)) idiom;
# pivoting over everything() is equivalent to listing all column names.
Variable_na <- Chol_2 %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Missing") %>%
  arrange(desc(Missing))
# Drop the heavily-missing variables identified above.
Drop_col <- c("RIDAGEMN", "BMXRECUM", "BMXHEAD", "BMDBMIC", "RIDEXPRG", "DMDYRUSZ")
Chol_2 <- Chol_2 %>% select(-any_of(Drop_col))
# Missing cells per row, worst rows first.
row_na <- rowSums(is.na(Chol_2))
row_na <- data.frame(row_na, Row = seq_along(row_na))
row_na <- row_na %>% arrange(desc(row_na))
#Most missing values in a row is 12, not bad
#Looks like a fairly normal distribution, maybe a little skewed to the right.
ggplot(Chol_2, aes(x = LBDLDL)) + geom_histogram() + ggtitle("Distribution of LDL Cholesterol") + xlab("LDL Chol.") + ylab("Count")
skewness(Chol_2$LBDLDL)
## [1] 0.7886403
#skewness value .7886403 confirms very mild skewness to the right
# Factors ----
# (was a bare `Factors` symbol, which errors when the script is sourced)
Chol_fact <- Chol_2 %>% select_if(is.factor)
# Horizontal bar chart of one factor column, selected by name.
# aes(x = .data[[xvar]]) replaces the deprecated aes_()/as.name() pattern.
Chol.bar <- function(xvar){
  ggplot(Chol_fact, aes(x = .data[[xvar]])) +
    geom_bar(color = "black") + coord_flip()
}
# Columns 7-10 are presumably the language features, 1-6 the rest — TODO confirm order.
Lang_barplots <- lapply(names(Chol_fact[,7:10]), Chol.bar)
Oth_barplots <- lapply(names(Chol_fact[,1:6]), Chol.bar)
grid.arrange(grobs = Lang_barplots, top = "Language Features")
grid.arrange(grobs = Oth_barplots, top = "Other Demographics")
# Numeric ----
# (was a bare `Numeric` symbol, which errors when the script is sourced)
Chol_num <- Chol_2 %>% select_if(is.numeric) %>% select(!LBDLDL)
# Histogram of one numeric column, selected by name.
# aes(x = .data[[xvar]]) replaces the deprecated aes_()/as.name() pattern.
Chol.hist <- function(xvar){
  ggplot(Chol_num, aes(x = .data[[xvar]])) +
    geom_histogram(color = "black")
}
# Columns 1-2 are presumably the demographic numerics, 3-10 body measures — TODO confirm order.
Dem_hist <- lapply(names(Chol_num[,1:2]), Chol.hist)
Body_hist <- lapply(names(Chol_num[,3:10]), Chol.hist)
grid.arrange(grobs = Dem_hist, top = "Demographic Features")
grid.arrange(grobs = Body_hist, top = "Body Measures")
# Heatmap ----
# (was a bare `Heatmap` symbol, which errors when the script is sourced)
# One-hot encode the factors so everything can enter one correlation matrix.
Chol_dummy <- fastDummies::dummy_cols(Chol_2)
Chol_dummy <- Chol_dummy %>% select_if(~!is.factor(.))
Chol_dummy[] <- lapply(Chol_dummy, as.numeric)
# "complete.obs": casewise deletion of rows with any NA before correlating.
Chol_cor <- cor(Chol_dummy, use = "complete.obs")
# Reuse the matrix computed above rather than recomputing it.
Chol_corplot <- corrplot(Chol_cor, tl.pos = 'n')
#Looks like some dummy variables that are refusal could be messing up correlations
Drop_col <- c("DMDBORN4_Ref", "DMDBORN4_DK", "DMDEDUC2_Ref", "DMDEDUC2_DK", "DMDEDUC2_NA", "DMDMARTZ_Ref", "DMDMARTZ_NA", "FIALANG_NA", "MIALANG_NA", "AIALANGA_NA")
Chol_dummy_2 <- Chol_dummy %>% select(-any_of(Drop_col))
invisible(cor(Chol_dummy_2, use = "complete.obs")) # using invisible() to reduce extensive output
corrplot(cor(Chol_dummy_2, use = "complete.obs"), tl.pos = 'n', type = 'lower')
### Smaller plots for easier interpretation.
# Mini Correlations: Sociological Measures:
# Hand-picked sociological dummy columns plus the target, correlated together.
socio <- Chol_dummy_2[,c("RIAGENDR_Male","RIAGENDR_Female",
"RIDRETH1_Mex", "RIDRETH1_OHis", "RIDRETH1_White",
"RIDRETH1_Black", "RIDRETH1_Oth", "RIDRETH3_Mex", "RIDRETH3_OHis",
"RIDRETH3_White", "RIDRETH3_Black", "RIDRETH3_Asian", "RIDRETH3_Oth",
"DMDBORN4_USA", "DMDBORN4_Oth", "DMDEDUC2_<9", "DMDEDUC2_9-11",
"DMDEDUC2_HS", "DMDEDUC2_AA", "DMDEDUC2_BS+","DMDMARTZ_Mar",
"DMDMARTZ_Sep", "DMDMARTZ_Nev", "DMDMARTZ_DK", "SIALANG_English",
"SIALANG_Spanish", "FIALANG_English", "FIALANG_Spanish",
"MIALANG_English","MIALANG_Spanish", "AIALANGA_English",
"AIALANGA_Spanish", "AIALANGA_Asian", "LBDLDL")]
invisible(cor(socio, use = "complete.obs")) # using invisible() to reduce extensive output
# hclust ordering groups mutually-correlated variables together in the plot.
corrplot(cor(socio, use = "complete.obs"), tl.pos = 'y', type = 'lower',
order = "hclust", tl.cex = 0.5)
# Mini Correlations: Biological Measures:
# Age, sex, and the body-measure columns plus the target.
biologic <- Chol_dummy_2[,c("RIDAGEYR", "BMXWT", "BMXHT", "BMXBMI",
"BMXLEG", "BMXARML", "BMXARMC", "BMXWAIST",
"BMXHIP", "RIAGENDR_Male", "RIAGENDR_Female","LBDLDL")]
invisible(cor(biologic, use = "complete.obs")) # using invisible() to reduce extensive output
corrplot(cor(biologic, use = "complete.obs"), tl.pos = 'y', type = 'lower',
order = "hclust", tl.cex = 0.8)
# correlations between certain biological measures make sense. BMI is derived from the MASS and height of an individual, so it makes sense that many of the BMI measurements correlate with each other. (i.e. hip, waist, and weight measurements correlate with a higher BMI. Being female correlates negatively with leg and arm length as well as height)
# looking at a base linear model, to see significant variables
model0 <- lm(LBDLDL~., Chol_2)
invisible(summary(model0)) # invisible() used to reduce output. Key observations will be noted below:
# significant contributors: variable (Pr(>|t|))
# RIDAGEYR (0.000132)
# (Intercept) (0.043474)
# MDBORN4Oth (0.076414)
# AIALANGASpanish (0.041803)
# BMXLEG (0.033051)
# BMXARMC (0.004507)
# BMXWAIST (0.071888)
# Looking at VIF of simple model showed aliased coefficients. Removed them in
# the model training process. (This narrative line was previously bare text,
# which errors when the script is sourced.)
# looking at VIF for baseline linear:
# vif(model0)
# highly/perfectly correlated factors, we might need to drop some
# Let's check for degenerate predictors from the original dataset
nearZeroVar(Chol, saveMetrics = FALSE)
## [1] 4 11 16 18 19
deg_chol <- subset(Chol, select=c(4,11,16,18,19))
colnames(deg_chol)
## [1] "RIDAGEMN" "RIDEXPRG" "INDFMPIR" "BMXRECUM" "BMXHEAD"
# Do it again on factor dataset
nearZeroVar(Chol_2, saveMetrics = FALSE)
## [1] 13
deg_chol2 <- subset(Chol_2, select=c(13))
colnames(deg_chol2)
## [1] "INDFMPIR"
# we may have to consider removing depending on the data used for modeling
# set the seed and split the data. We'll do an 80/20 split
set.seed(123)
# Partition is stratified on the outcome; Chol and Chol_dummy share row order,
# so the same indices apply to both.
Chol_split <- createDataPartition(Chol$LBDLDL, p=0.80, list=FALSE)
# split into train and test
Chol_train <- Chol_dummy[Chol_split,]
Chol_test <- Chol_dummy[-Chol_split,]
# split predictors from the target
Chol_train_X <- as.data.frame(subset(Chol_train, select=-c(LBDLDL)))
Chol_train_y <- Chol_train$LBDLDL
Chol_test_X <- as.data.frame(subset(Chol_test, select=-c(LBDLDL)))
Chol_test_y <- Chol_test$LBDLDL
# Creating imputed data sets: center/scale + kNN imputation fitted on train only
Chol_imp <- preProcess(Chol_train_X, method = c("center", "scale", "knnImpute"))
Chol_train_X_imp <- predict(Chol_imp, Chol_train_X)
Chol_test_X_imp <- predict(Chol_imp, Chol_test_X)
# Adding Resampling/Validation Set and Control
set.seed(123)
# BUG FIX: createFolds() expects the outcome vector. Passing the predictor
# data frame folded over its *columns* (length(df) == ncol(df)), producing
# invalid resampling indices. Fold on Chol_train_y instead.
Chol_folds <- createFolds(y = Chol_train_y, k = 10, returnTrain = TRUE)
Chol_control <- trainControl(method = "cv", index = Chol_folds)
# Numeric-only predictor set: drop most factor columns and the target.
Drop_col <- c('RIDRETH1', 'RIDRETH3', 'DMDBORN4', 'DMDEDUC2', 'DMDMARTZ', 'SIALANG', 'FIALANG', 'MIALANG', 'AIALANGA', 'LBDLDL')
Chol_num <- Chol_2 %>% select(-any_of(Drop_col))
# Dummy-code any remaining factors, then keep only non-factor columns.
Chol_dummy <- fastDummies::dummy_cols(Chol_num)
Chol_num <- Chol_dummy %>% select_if(~!is.factor(.))
Chol_num[] <- lapply(Chol_num, as.numeric)
# Same train/test rows as the main split above.
Chol_num_tr_X <- as.data.frame(Chol_num[Chol_split, ])
Chol_num_test_X <- as.data.frame(Chol_num[-Chol_split, ])
#Preprocess: center/scale + kNN imputation fitted on the training rows only
Chol_imp <- preProcess(Chol_num_tr_X, method = c("center", "scale", "knnImpute"))
Chol_num_tr_X <- predict(Chol_imp, Chol_num_tr_X)
Chol_num_test_X <- predict(Chol_imp, Chol_num_test_X)
# Adding Resampling/Validation Set and Control
set.seed(123)
# BUG FIX: as with Chol_folds, createFolds() must receive the outcome vector,
# not the predictor data frame (which folds over columns). Chol_train_y holds
# LBDLDL for these same training rows.
Chol_folds_num <- createFolds(y = Chol_train_y, k = 10, returnTrain = TRUE)
Chol_control_num <- trainControl(method = "cv", index = Chol_folds_num)
# Baseline OLS on the full imputed/scaled dummy-coded training predictors.
Chol_ols_tune <- train(x = Chol_train_X_imp, y = Chol_train_y, method = "lm", trControl = Chol_control)
plot(Chol_ols_tune$finalModel)
### FIX TRAINING SET
# VIF shows aliased coefficients, need to get rid of those by removing high cor predictors
test <- cor(Chol_train_X_imp)
# Also have an issue with DMDEDUC2_DK all being zero so get rid of high var predictors
Chol_tr_x_imp_vr <- Chol_train_X_imp[, -nearZeroVar(Chol_train_X_imp)]
# Drop one of each pair of predictors correlated above 0.9.
Chol_tr_X_imp_fin <- Chol_tr_x_imp_vr[, -findCorrelation(cor(Chol_tr_x_imp_vr), cutoff = 0.9)]
Chol_ols_tune2 <- train(x = Chol_tr_X_imp_fin, y = Chol_train_y, method = "lm", trControl = Chol_control)
plot(Chol_ols_tune2$finalModel)
invisible(summary(Chol_ols_tune2$finalModel)) # invisible used to minimize output clutter. Key observations to be noted in final report
# Box-Cox variant of the cleaned training predictors for a third OLS fit.
Chol_bct <- preProcess(Chol_tr_X_imp_fin, method = "BoxCox")
Chol_tr_boxcox <- predict(Chol_bct, Chol_tr_X_imp_fin)
Chol_ols_tune3 <- train(x = Chol_tr_boxcox, y = Chol_train_y, method = "lm", trControl = Chol_control)
plot(Chol_ols_tune3$finalModel)
invisible(summary(Chol_ols_tune3$finalModel)) #invisible used to reduce cluttered output. Key observations noted in final report
# OLS on the numeric-only predictor set.
Chol_ols_tune_num <- train(x = Chol_num_tr_X, y = Chol_train_y, method = "lm", trControl = Chol_control_num)
plot(Chol_ols_tune_num$finalModel)
invisible(summary(Chol_ols_tune_num$finalModel)) # hiding long output to reduce clutter. Key observations to be noted in final report
# Reduced model keeping only the predictors flagged significant above.
Chol_sig_tr <- Chol_tr_X_imp_fin %>% select(RIDAGEYR, BMXHT, BMXBMI, BMXLEG, BMXARMC, DMDBORN4_Oth, DMDEDUC2_NA, MIALANG_NA, AIALANGA_NA)
Chol_ols <- train(x = Chol_sig_tr, y = Chol_train_y, method = "lm", trControl = Chol_control)
plot(Chol_ols$finalModel)
invisible(summary(Chol_ols$finalModel)) # hidden to reduce output clutter. Key observations to be noted
#Predict on test data
# BUG FIX: the model was trained on imputed/scaled predictors, so it must
# predict on the imputed/scaled test set (raw Chol_test_X was used before).
Chol_ols_res <- predict(Chol_ols, Chol_test_X_imp)
# Principal components regression on the cleaned dummy-coded predictors.
set.seed(123)
Chol_pcr <- train(x = Chol_tr_X_imp_fin, y = Chol_train_y, method = "pcr", tuneGrid = expand.grid(ncomp=1:32), trControl = Chol_control)
invisible(Chol_pcr) # output hidden in final report to reduce clutter
# PCR on the Box-Cox transformed predictors.
set.seed(123)
Chol_pcr_box <- train(x = Chol_tr_boxcox, y = Chol_train_y, method = "pcr", tuneGrid = expand.grid(ncomp=1:32), trControl = Chol_control)
invisible(Chol_pcr_box) # output hidden in final report to reduce clutter
# PCR on the numeric-only predictor set (smaller ncomp grid for the smaller set).
set.seed(123)
Chol_pcr_num <- train(x = Chol_num_tr_X, y = Chol_train_y, method = "pcr", tuneGrid = expand.grid(ncomp=1:8), trControl = Chol_control_num)
invisible(Chol_pcr_num) # output hidden in final report to reduce clutter
# Tag each resampling table with a model label for the combined plots below.
pcr_resamp <- Chol_pcr$results
pcr_resamp$Model <- "PCR"
box_pcr_resamp <- Chol_pcr_box$results
box_pcr_resamp$Model <- "BPCR"
num_pcr_resamp <- Chol_pcr_num$results
num_pcr_resamp$Model <- "PCR"
# key observations and summary to be noted in final report
# Partial least squares on the same three predictor sets.
set.seed(123)
Chol_pls <- train(x = Chol_tr_X_imp_fin, y = Chol_train_y, method = "pls", tuneGrid = expand.grid(ncomp = 1:32), trControl = Chol_control)
invisible(Chol_pls) # output hidden to reduce clutter
set.seed(123)
Chol_pls_box <- train(x = Chol_tr_boxcox, y = Chol_train_y, method = "pls", tuneGrid = expand.grid(ncomp = 1:32), trControl = Chol_control)
invisible(Chol_pls_box) # output hidden to reduce clutter
set.seed(123)
Chol_pls_num <- train(x = Chol_num_tr_X, y = Chol_train_y, method = "pls", tuneGrid = expand.grid(ncomp = 1:8), trControl = Chol_control_num)
invisible(Chol_pls_num) # output hidden to reduce clutter
pls_resamp <- Chol_pls$results
pls_resamp$Model <- "PLS"
pls_box_resamp <- Chol_pls_box$results
pls_box_resamp$Model <- "BPLS"
pls_num_resamp <- Chol_pls_num$results
pls_num_resamp$Model <- "PLS"
# Cross-validated RMSE vs. number of components for the dummy-coded models.
plot_data <- rbind(pcr_resamp, box_pcr_resamp, pls_resamp, pls_box_resamp)
xyplot(RMSE ~ ncomp, data = plot_data, xlab = "# of Components", ylab = "RMSE (Cross-validation)", auto.key = list(columns = 4), groups = Model, type = c("o", "g"))
# Same comparison for the numeric-only models.
plot2_data <- rbind(num_pcr_resamp, pls_num_resamp)
xyplot(RMSE ~ ncomp, data = plot2_data, xlab = "# of Components", ylab = "RMSE (Cross-validation)", auto.key = list(columns = 2), groups = Model, type = c("o", "g"))
# Ridge regression over a lambda grid, on each of the three predictor sets.
set.seed(123)
Chol_ridge <- train(x = Chol_tr_X_imp_fin, y= Chol_train_y, method = "ridge", tuneGrid = expand.grid(lambda = seq(0, .1, length = 15)), trControl = Chol_control)
invisible(Chol_ridge) # model output hidden to reduce clutter
set.seed(123)
Chol_ridge_box <- train(x = Chol_tr_boxcox, y= Chol_train_y, method = "ridge", tuneGrid = expand.grid(lambda = seq(0, .5, length = 15)), trControl = Chol_control)
invisible(Chol_ridge_box) # model output hidden to reduce clutter
set.seed(123)
Chol_ridge_num <- train(x = Chol_num_tr_X, y= Chol_train_y, method = "ridge", tuneGrid = expand.grid(lambda = seq(0, .5, length = 15)), trControl = Chol_control_num)
invisible(Chol_ridge_num) # model output hidden to reduce clutter
print(update(plot(Chol_ridge), xlab = "Penalty"))
print(update(plot(Chol_ridge_box), xlab = "Penalty"))
print(update(plot(Chol_ridge_num), xlab = "Penalty"))
# Elastic net: lambda (ridge penalty) crossed with fraction (lasso path position).
enet_grid <- expand.grid(lambda = c(0, 0.01, 0.1), fraction = seq(0.05, 1, length = 20))
# set.seed() added before each fit for parity with the other models.
set.seed(123)
Chol_enet <- train(x = Chol_tr_X_imp_fin, y = Chol_train_y, method = "enet", tuneGrid = enet_grid, trControl = Chol_control)
invisible(Chol_enet) # model output hidden to reduce clutter
# BUG FIX: the "_box" model previously trained on Chol_tr_X_imp_fin, silently
# duplicating Chol_enet; it now uses the Box-Cox transformed set.
set.seed(123)
Chol_enet_box <- train(x = Chol_tr_boxcox, y = Chol_train_y, method = "enet", tuneGrid = enet_grid, trControl = Chol_control)
invisible(Chol_enet_box) # model output hidden to reduce clutter
set.seed(123)
Chol_enet_num <- train(x = Chol_num_tr_X, y = Chol_train_y, method = "enet", tuneGrid = enet_grid, trControl = Chol_control_num)
plot(Chol_enet)
plot(Chol_enet_box)
plot(Chol_enet_num)
# Note: No diagnostic plot showed much of a difference between boxcox vs.
# regular (not suprising given normality in histograms), so we'll just use the
# regular constructed models. (This note was previously bare text, which
# errors when the script is sourced.)
# Test-set predictions from each linear model family.
Res_OLS <- predict(Chol_ols, Chol_test_X_imp)
Res_OLS_num <- predict(Chol_ols_tune_num, Chol_num_test_X)
Res_PLS <- predict(Chol_pls, Chol_test_X_imp)
Res_PLS_num <- predict(Chol_pls_num, Chol_num_test_X)
Res_PCR <- predict(Chol_pcr, Chol_test_X_imp)
Res_PCR_num <- predict(Chol_pcr_num, Chol_num_test_X)
Res_Ridge <- predict(Chol_ridge, Chol_test_X_imp)
Res_Ridge_num <- predict(Chol_ridge_num, Chol_num_test_X)
Res_Enet <- predict(Chol_enet, Chol_test_X_imp)
Res_Enet_num <- predict(Chol_enet_num, Chol_num_test_X)
# One wide frame: observed LDL plus every model's predictions.
Linear_res <- cbind.data.frame(Observed = Chol_test_y, OLS = Res_OLS, OLS_num = Res_OLS_num, PLS = Res_PLS, PLS_num = Res_PLS_num, PCR = Res_PCR, PCR_num = Res_PCR_num ,Ridge = Res_Ridge, Ridge_num = Res_Ridge_num, ENet = Res_Enet, ENet_num = Res_Enet_num)
# RMSE of one prediction column against the observed values.
find_rmse <- function(x){
  caret::RMSE(x, Linear_res[,"Observed"])
}
# Columns 2:11 are the ten prediction columns.
RMSE_results <- apply(X = Linear_res[,2:11], FUN = find_rmse, MARGIN = 2)
RMSE_results <- data.frame(RMSE_results)
RMSE_results$Model <- rownames(RMSE_results)
# Lollipop chart of test RMSE per model (lowest RMSE at the top after the flip).
ggplot(RMSE_results, aes(x=reorder(Model, -RMSE_results), y=RMSE_results)) + geom_segment(aes(x=reorder(Model, -RMSE_results), xend = reorder(Model, -RMSE_results), y=30, yend=RMSE_results), color = "cadetblue") + geom_point(color = "darkblue", size = 10) + coord_flip() + ylab("RMSE") + geom_text(aes(label = round(RMSE_results, 2)), color = "white", size = 2.5)
# initial SVM model with radial basis and processed Chol_tr_X_imp_fin and Chol_train_y
set.seed(123)
# tuneLength = 14 lets caret search 14 cost values (sigma estimated internally).
svmR0 <- train(x=Chol_tr_X_imp_fin, y = Chol_train_y,
method = "svmRadial",
preProcess = c("center", "scale"),
tuneLength = 14,
trControl = Chol_control)
invisible(svmR0) # model output hidden to reduce clutter. Key observations noted below:
# final model uses: sigma = 0.0207376 and C = 0.25
# RMSE: 35.40559, Rsquared: 0.019455264
plot(svmR0, scales = list(x = list(log = 2)), main="SVM-Radial Initial")
# svm radial model v2
# issue causing variables in X: MIALANG_Spanish, DMDEDUC2_<9, MIALANG_NA (zero var)
Chol_tr_X_impfin_drop <- c("MIALANG_Spanish", "DMDEDUC2_<9", "MIALANG_NA")
Chol_tr_X_impfin_sv <- subset(Chol_tr_X_imp_fin,
select = !(names(Chol_tr_X_imp_fin) %in% Chol_tr_X_impfin_drop))
#making test X have same columns available
Chol_te_X_sv <- subset(Chol_test_X_imp, select = c(names(Chol_tr_X_impfin_sv)))
dim(Chol_tr_X_impfin_sv)
## [1] 3696 29
dim(Chol_te_X_sv)
## [1] 921 29
# sigma grid instead of using tuneLength = 14
sigmaEst <- kernlab::sigest(as.matrix(Chol_tr_X_impfin_sv[,1:29]))
Csearch <- 2^seq(-4,+4)
# sigma estimates using kernlab's sigest function
svmgrid <- expand.grid(sigma = sigmaEst, C = Csearch)
#model
set.seed(123)
svmR1 <- train(x=Chol_tr_X_impfin_sv, y = Chol_train_y,
method = "svmRadial",
preProcess = c("center", "scale"),
tuneGrid = svmgrid,
trControl = Chol_control)
invisible(svmR1) # model output hidden to reduce clutter. Key observations noted below:
# final model uses: sigma = 0.03504524 and C = 0.25.
# RMSE: 35.30443, Rsquared: 0.020245225
plot(svmR1, scales = list(x = list(log = 2)), main="SVM-Radial v2")
svmR1$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 0.25
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.035045237945048
##
## Number of Support Vectors : 3377
##
## Objective Function Value : -557.6121
## Training error : 0.82428
# going to use the x training set from final svm-radial due to assuming there will be the same problem causing factors of nearZeroVar.
set.seed(123)
# Polynomial-kernel SVM: grid over interaction degree, kernel scale, and cost C.
svmP <- train(x=Chol_tr_X_impfin_sv, y = Chol_train_y,
method = "svmPoly",
preProcess = c("center", "scale"),
tuneGrid = expand.grid(degree = 1:3,
scale = c(0.01, 0.005, 0.001, 0.0005),
C = Csearch),
trControl = Chol_control)
invisible(svmP) # model output hidden to reduce clutter. Key observations noted below:
# final model uses: degree = 2, scale = 0.001, offset = 1
# sigma = 0.02231109 and C = 2.
# RMSE: 35.44929, Rsquared: 0.011746076
plot(svmP, scales = list(x = list(log = 2),
between=list(x=.5, y=1)), main="SVM-Polynomial")
svmP$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 2
##
## Polynomial kernel function.
## Hyperparameters : degree = 2 scale = 0.001 offset = 1
##
## Number of Support Vectors : 3366
##
## Objective Function Value : -4741.157
## Training error : 0.932111
# KNN Model needs to have NZV removed so again we are using the x=Chol_tr_X_impfin_sv to train and Chol_te_X_sv to test
set.seed(123)
# k-nearest-neighbor regression, searching every k from 1 to 40.
knnTune <- train(x=Chol_tr_X_impfin_sv, y = Chol_train_y,
method = "knn",
preProcess = c("center", "scale"),
tuneGrid = data.frame(k=1:40),
trControl = Chol_control)
invisible(knnTune) # model output hidden to reduce clutter. Key observations noted below:
# final model uses: k=40
# RMSE: 35.63394, Rsquared: 0.0006650456
plot(knnTune, main="KNN")
knnTune$finalModel
## 20-nearest neighbor regression model
# NOTE(review): the note above says k=40 but the captured final-model print
# reports 20 neighbors — these appear to come from different runs; re-run and
# reconcile before citing either value.
# MARS model doesn't need preprocessing, so first rendition will be with Chol_tr_X_imp_fin
set.seed(123)
# Tune over interaction degree (1-3) and number of retained terms (nprune).
mars1 <- train(x = Chol_tr_X_imp_fin, y = Chol_train_y,
method = "earth",
tuneGrid = expand.grid(degree = 1:3, nprune = 2:38),
trControl = Chol_control)
## Loading required package: earth
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
mars1$finalModel
## Selected 2 of 18 terms, and 1 of 32 predictors (nprune=2)
## Termination condition: RSq changed by less than 0.001 at 18 terms
## Importance: RIDAGEYR, INDFMPIR-unused, BMXHT-unused, BMXBMI-unused, ...
## Number of terms at each degree of interaction: 1 1 (additive model)
## GCV 1192.459 RSS 4400176 GRSq 0.05246562 RSq 0.05349109
invisible(mars1$results) # model results hidden to reduce clutter. Key observations noted below:
#used 1 of 32 predictors, 2 of 18 terms (nprune =2) degree=1
# RMSE: 35.02993, Rsquared: 0.05354368
plot(mars1, main="MARS Initial")
# MARS model using same X sets as SVM models:
set.seed(123)
mars2 <- train(x = Chol_tr_X_impfin_sv, y = Chol_train_y,
method = "earth",
tuneGrid = expand.grid(degree = 1:3, nprune = 2:38),
trControl = Chol_control)
mars2$finalModel
## Selected 2 of 18 terms, and 1 of 29 predictors (nprune=2)
## Termination condition: RSq changed by less than 0.001 at 18 terms
## Importance: RIDAGEYR, INDFMPIR-unused, BMXHT-unused, BMXBMI-unused, ...
## Number of terms at each degree of interaction: 1 1 (additive model)
## GCV 1192.459 RSS 4400176 GRSq 0.05246562 RSq 0.05349109
plot(mars2, main="MARS Secondary")
# No change between the two MARS models. Drops all but two factors for both.
#nprune=2, degree=1, RMSE: 34.80857, Rsquared: 0.05457991
# Hyperparameter grid for nnet: weight decay crossed with hidden-layer size.
# (Two set.seed(123) calls that were immediately overridden by set.seed(100)
# before any random draws have been removed — the effective seed is unchanged.)
nnetGrid <- expand.grid(decay = c(0, 0.01, .1), size = c(3, 7, 11, 13))
# NNET first rendition will be with Chol_tr_X_imp_fin
set.seed(100)
# MaxNWts bounds the weight count for the largest grid entry (size = 13).
nnet1 <- train(x = Chol_tr_X_imp_fin, y = Chol_train_y,
               method = "nnet",
               tuneGrid = nnetGrid,
               trControl = Chol_control,
               preProc = c("center", "scale"),
               linout = TRUE,
               trace = FALSE,
               MaxNWts = 13 * (ncol(Chol_tr_X_imp_fin) + 1) + 13 + 1,
               maxit = 100)
invisible(nnet1) # model output hidden to reduce clutter. Key observations noted below:
# size=3, decay=0.1
# RMSE: 40.97813, Rsquared: 0.0051328366
plot(nnet1, main="NNET first")
# NNET second rendition will be with Chol_tr_X_impfin_sv
set.seed(100)
nnet2 <- train(x = Chol_tr_X_impfin_sv, y = Chol_train_y,
               method = "nnet",
               tuneGrid = nnetGrid,
               trControl = Chol_control,
               preProc = c("center", "scale"),
               linout = TRUE,
               trace = FALSE,
               MaxNWts = 13 * (ncol(Chol_tr_X_impfin_sv) + 1) + 13 + 1,
               maxit = 100)
invisible(nnet2) # model output hidden to reduce clutter. Key observations noted below:
# size=13, decay=0.1
# RMSE: 44.38061, Rsquared: 0.001743654
plot(nnet2, main="NNET second")
## Comparing Nonlinear Models: ### Saving Results
# Collect test-set predictions from each nonlinear model.
NonLpred <- data.frame(obs=Chol_test_y)
NonLpred$svmR <- predict(svmR1, Chol_te_X_sv)
NonLpred$svmP <- predict(svmP, Chol_te_X_sv)
NonLpred$KNN <- predict(knnTune, Chol_te_X_sv)
NonLpred$MARS2 <- predict(mars2, Chol_te_X_sv)
NonLpred$NNET2 <- predict(nnet2, Chol_te_X_sv)
# Derive the x-axis from the data rather than hard-coding the 921 test rows.
plotpred <- data.frame(x=seq_len(nrow(NonLpred)), y1=NonLpred$obs, y2=NonLpred$svmR,
                       y3=NonLpred$svmP, y4=NonLpred$KNN,
                       y5=NonLpred$MARS2[,"y"], y6=NonLpred$NNET2)
# Observed series first, each model's predictions overlaid in a new color.
plot(plotpred$x, plotpred$y1, type = "l", col = 1,
     xlab = "prediction #", ylab = "prediction values",
     main = "Nonlinear Model Predictions")
lines(plotpred$x, plotpred$y2, col = 2)
lines(plotpred$x, plotpred$y3, col = 3)
lines(plotpred$x, plotpred$y4, col = 4)
lines(plotpred$x, plotpred$y5, col = 5)
lines(plotpred$x, plotpred$y6, col = 6)
legend("bottomleft", cex=0.5, legend = c("Observed", "SVM-Radial",
                                         "SVM-Polynomial", "KNN", "MARS", "NNET"),
       col = 1:6, lwd = 2)
### Getting RMSE and Plotting
# RMSE = sqrt(sum((obs-pred)^2)/n), where n is the number of test rows
# Root-mean-square error between two equal-length numeric vectors,
# dividing by the length of the first argument.
getRMSE <- function(x,y) {
  squared_diff <- (x - y)^2
  sqrt(sum(squared_diff) / length(x))
}
# Test-set RMSE table for the five nonlinear models.
nonlin_rmse <- data.frame(c("svmRad", "svmPoly", "KNN",
"MARS", "NNet"))
nonlin_rmse$RMSE <- c(getRMSE(NonLpred$obs, NonLpred$svmR),
getRMSE(NonLpred$obs, NonLpred$svmP),
getRMSE(NonLpred$obs, NonLpred$KNN),
getRMSE(NonLpred$obs, NonLpred$MARS2),
getRMSE(NonLpred$obs, NonLpred$NNET2))
colnames(nonlin_rmse)[1]<-"Model Type"
# Sort ascending so the best (lowest-RMSE) model is listed first.
nonlin_rmse[order(nonlin_rmse$RMSE),]
## Model Type RMSE
## 1 svmRad 32.94845
## 2 svmPoly 33.03741
## 4 MARS 33.52618
## 3 KNN 33.85140
## 5 NNet 36.94634
# best non linear model is svm radial with
# sigma = sigma = 0.03504524 and C = 0.25. (RMSE is 32.95)
# Experimental feature set: drop the RIDRETH1 dummies and BMXBMI
# (presumably because RIDRETH3 covers the same groups and BMI is derived
# from weight/height — confirm rationale).
expDropC <- c("RIDRETH1_Black", "RIDRETH1_Mex", "RIDRETH1_OHis",
"RIDRETH1_Oth", "RIDRETH1_White", "BMXBMI")
Xtrial_tr <- subset(Chol_train_X_imp, select = !(names(Chol_train_X_imp) %in% expDropC))
# looking for near zero var:
Xtri_nzv <- nearZeroVar(Xtrial_tr)
# "DMDBORN4_Ref", "DMDBORN4_DK", "DMDEDUC2_Ref", "DMDEDUC2_DK", "DMDMARTZ_Ref",
# "DMDMARTZ_DK", "AIALANGA_Asian"
Xtri_tr_nz <- subset(Xtrial_tr, select = -c(Xtri_nzv))
print(paste("Xtrial_tr ncol: ", ncol(Xtrial_tr), " NZV removed, new ncol: ", ncol(Xtri_tr_nz)))
## [1] "Xtrial_tr ncol: 47 NZV removed, new ncol: 40"
# dropping SIALANG groups (sample person interview instrument lang)
expDropC <- c("SIALANG_English", "SIALANG_Spanish",
"MIALANG_English", "MIALANG_Spanish", "MIALANG_NA")
Xtri_tr_nz <- subset(Xtri_tr_nz, select = !(names(Xtri_tr_nz) %in% expDropC))
print(paste("Xtrial_tr ncol: ", ncol(Xtrial_tr), " NZV removed, new ncol: ", ncol(Xtri_tr_nz)))
## [1] "Xtrial_tr ncol: 47 NZV removed, new ncol: 35"
#looking for high corr:
Xtritr_hiC <- findCorrelation(cor(Xtri_tr_nz), cutoff = 0.8)
Xtritr_hiC
## [1] 33 18 3 30 8 9 25 10
# "AIALANGA_English", "DMDBORN4_USA", "BMXWT", "FIALANG_English",
# "BMXWAIST", "BMXHIP", "DMDEDUC2_NA", "RIAGENDR_Male"
# NOTE(review): Xtritr_hiC is reused here — it changes from an index vector to
# a data frame of the flagged columns. A distinct name would be clearer.
Xtritr_hiC <- subset(Xtri_tr_nz, select = c("AIALANGA_English", "DMDBORN4_USA",
"BMXWT", "FIALANG_English", "BMXWAIST",
"BMXHIP", "DMDEDUC2_NA", "RIAGENDR_Male"))
invisible(cor(Xtritr_hiC)) # invisible used to reduce extensive output
corrplot(cor(Xtritr_hiC), order = "hclust", type="lower")
#dropping "BMXHIP", "BMXWAIST", DMDEDUC2_<9 (recurring issues in model attempts)
drophiC <- c("BMXHIP", "BMXWAIST", "DMDEDUC2_<9")
X_trial_train <- subset(Xtri_tr_nz, select = !(names(Xtri_tr_nz) %in% drophiC))
corrplot(cor(X_trial_train), order = "hclust", type="lower", tl.cex = 0.7)
# Keep exactly the same columns in the test set.
keepsies <- colnames(X_trial_train)
X_trial_test <- subset(Chol_test_X_imp, select = c(keepsies))
#svm radial with X_trial_train and log adjusted y
# Log10-transform the outcome (LDL is mildly right-skewed per the earlier histogram).
log_Y_train <- log10(Chol_train_y)
hist(log_Y_train)
log_Y_test <- log10(Chol_test_y)
#model
set.seed(123)
svmR_trial <- train(x=X_trial_train, y = log_Y_train,
method = "svmRadial",
preProcess = c("center", "scale"),
tuneLength = 14,
trControl = Chol_control)
invisible(svmR_trial) # output hidden to reduce clutter. Key observations noted below:
# issues in: DMDEDUC2_<9,
# final model uses: sigma = 0.02019005 and C = 0.25.
# RMSE: 0.1531765, Rsquared: 0.021637157
# while these are the lowest values, the graph is identical to the non-log adjusted SVM radial model with just different RMSE values.
plot(svmR_trial, scales = list(x = list(log = 2)), main="SVM-Radial with X_trial_train and Log Y")
svmR_trial$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 0.25
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0201900463201106
##
## Number of Support Vectors : 3367
##
## Objective Function Value : -574.0137
## Training error : 0.850126
trialPred <- data.frame(obs=log_Y_test)
trialPred$svmRad <- predict(svmR_trial, X_trial_test)
set.seed(123)
# let's see how it splits the training data
# Single regression tree; cp = 0.003 permits fairly deep splitting.
# NOTE(review): the formula pulls Chol_train_y from the calling environment,
# not from a column of Chol_tr_X_imp_fin -- the two must be row-aligned.
Chol_rpart <- rpart(Chol_train_y ~., data = Chol_tr_X_imp_fin, cp = 0.003)
invisible(summary(Chol_rpart)) # output hidden to reduce clutter. Key observations noted below:
## Call:
## rpart(formula = Chol_train_y ~ ., data = Chol_tr_X_imp_fin, cp = 0.003)
## n= 3696
##
## CP nsplit rel error xerror xstd
## 1 0.054164414 0 1.0000000 1.0004973 0.03300310
## 2 0.020563310 1 0.9458356 0.9554923 0.03188785
## 3 0.013308632 2 0.9252723 0.9464258 0.03207678
## 4 0.007112960 3 0.9119636 0.9311717 0.03205759
## 5 0.005855210 4 0.9048507 0.9260955 0.03176146
## 6 0.004972343 6 0.8931403 0.9256549 0.03150486
## 7 0.004824224 7 0.8881679 0.9238887 0.03142018
## 8 0.004142728 8 0.8833437 0.9264225 0.03141473
## 9 0.003828087 10 0.8750582 0.9339156 0.03236363
## 10 0.003774630 11 0.8712302 0.9329465 0.03238441
## 11 0.003599490 12 0.8674555 0.9316994 0.03236427
## 12 0.003384505 13 0.8638560 0.9351374 0.03241022
## 13 0.003021559 14 0.8604715 0.9307203 0.03231526
## 14 0.003000000 15 0.8574500 0.9348469 0.03254034
##
## Variable importance
## RIDAGEYR BMXBMI DMDEDUC2_NA DMDMARTZ_Nev BMXARMC
## 32 13 12 9 8
## BMXARML AIALANGA_NA BMXLEG BMXHT RIAGENDR_Male
## 5 5 4 4 4
## INDFMPIR DMDMARTZ_Sep DMDEDUC2_BS+ DMDMARTZ_Mar
## 1 1 1 1
##
## Node number 1: 3696 observations, complexity param=0.05416441
## mean=105.2411, MSE=1257.805
## left son=2 (1098 obs) right son=3 (2598 obs)
## Primary splits:
## RIDAGEYR < -0.7177214 to the left, improve=0.05416441, (0 missing)
## DMDEDUC2_NA < 0.9305478 to the right, improve=0.04807959, (0 missing)
## DMDMARTZ_Nev < -0.2411516 to the right, improve=0.02845850, (0 missing)
## BMXBMI < -0.7742706 to the left, improve=0.02378829, (0 missing)
## BMXARMC < -0.8110234 to the left, improve=0.01974683, (0 missing)
## Surrogate splits:
## DMDEDUC2_NA < 0.9305478 to the right, agree=0.862, adj=0.536, (0 split)
## DMDMARTZ_Nev < -0.2411516 to the right, agree=0.846, adj=0.482, (0 split)
## BMXBMI < -0.9678924 to the left, agree=0.735, adj=0.108, (0 split)
## BMXARMC < -1.151012 to the left, agree=0.730, adj=0.091, (0 split)
##
## Node number 2: 1098 observations, complexity param=0.01330863
## mean=92.54463, MSE=805.3737
## left son=4 (536 obs) right son=5 (562 obs)
## Primary splits:
## BMXBMI < -0.5935568 to the left, improve=0.06996470, (0 missing)
## BMXARMC < -0.113152 to the left, improve=0.06346379, (0 missing)
## RIDAGEYR < -1.398075 to the left, improve=0.04298442, (0 missing)
## DMDEDUC2_NA < 0.9305478 to the right, improve=0.03832009, (0 missing)
## DMDEDUC2_AA < -0.4815614 to the right, improve=0.02453409, (0 missing)
## Surrogate splits:
## BMXARMC < -0.4352465 to the left, agree=0.884, adj=0.763, (0 split)
## BMXARML < -0.3506071 to the left, agree=0.622, adj=0.226, (0 split)
## RIDAGEYR < -1.203688 to the left, agree=0.605, adj=0.190, (0 split)
## DMDEDUC2_NA < 0.9305478 to the right, agree=0.604, adj=0.188, (0 split)
## DMDEDUC2_BS+ < -0.3327601 to the right, agree=0.587, adj=0.153, (0 split)
##
## Node number 3: 2598 observations, complexity param=0.02056331
## mean=110.607, MSE=1352.096
## left son=6 (968 obs) right son=7 (1630 obs)
## Primary splits:
## RIDAGEYR < 0.7887756 to the right, improve=0.027213960, (0 missing)
## AIALANGA_NA < 0.7694033 to the right, improve=0.019766670, (0 missing)
## BMXBMI < 0.6714394 to the right, improve=0.007005182, (0 missing)
## DMDBORN4_Oth < 0.5241397 to the left, improve=0.006384595, (0 missing)
## BMXARML < 0.8611078 to the right, improve=0.005282776, (0 missing)
## Surrogate splits:
## AIALANGA_NA < 0.7694033 to the right, agree=0.787, adj=0.428, (0 split)
## DMDMARTZ_Sep < 0.6618522 to the right, agree=0.653, adj=0.068, (0 split)
## BMXLEG < -2.129812 to the left, agree=0.630, adj=0.007, (0 split)
## BMXHT < -2.253535 to the left, agree=0.629, adj=0.005, (0 split)
## BMXARML < -2.618245 to the left, agree=0.628, adj=0.001, (0 split)
##
## Node number 4: 536 observations
## mean=84.85821, MSE=627.8344
##
## Node number 5: 562 observations, complexity param=0.003828087
## mean=99.87544, MSE=864.6108
## left son=10 (146 obs) right son=11 (416 obs)
## Primary splits:
## RIDAGEYR < -1.398075 to the left, improve=0.03662437, (0 missing)
## BMXBMI < 0.32292 to the left, improve=0.02895393, (0 missing)
## BMXARMC < 0.3163073 to the left, improve=0.02481897, (0 missing)
## DMDEDUC2_NA < 0.9305478 to the right, improve=0.02327611, (0 missing)
## DMDEDUC2_AA < -0.4815614 to the right, improve=0.01764382, (0 missing)
## Surrogate splits:
## DMDEDUC2_NA < 0.9305478 to the right, agree=0.826, adj=0.329, (0 split)
## INDFMPIR < -1.53041 to the left, agree=0.746, adj=0.021, (0 split)
## BMXBMI < -0.5548324 to the left, agree=0.746, adj=0.021, (0 split)
## BMXARMC < -1.043647 to the left, agree=0.746, adj=0.021, (0 split)
## BMXHT < -2.298376 to the left, agree=0.744, adj=0.014, (0 split)
##
## Node number 6: 968 observations, complexity param=0.00711296
## mean=102.7355, MSE=1416.277
## left son=12 (495 obs) right son=13 (473 obs)
## Primary splits:
## RIAGENDR_Male < 0.02868757 to the right, improve=0.02411971, (0 missing)
## BMXHT < -0.4649097 to the right, improve=0.02055340, (0 missing)
## BMXARML < 0.8611078 to the right, improve=0.01581797, (0 missing)
## BMXBMI < 0.3706801 to the right, improve=0.01427778, (0 missing)
## BMXARMC < -0.1310462 to the right, improve=0.01285898, (0 missing)
## Surrogate splits:
## BMXHT < -0.06633024 to the right, agree=0.853, adj=0.700, (0 split)
## BMXLEG < -0.2429857 to the right, agree=0.773, adj=0.535, (0 split)
## BMXARML < 0.2864087 to the right, agree=0.770, adj=0.529, (0 split)
## DMDMARTZ_Mar < -0.1600155 to the right, agree=0.616, adj=0.214, (0 split)
## DMDMARTZ_Sep < 0.6618522 to the left, agree=0.606, adj=0.195, (0 split)
##
## Node number 7: 1630 observations, complexity param=0.00585521
## mean=115.2816, MSE=1255.334
## left son=14 (464 obs) right son=15 (1166 obs)
## Primary splits:
## RIDAGEYR < -0.2317546 to the left, improve=0.011590290, (0 missing)
## BMXBMI < 0.7617963 to the right, improve=0.011509240, (0 missing)
## BMXLEG < -1.843139 to the right, improve=0.007419219, (0 missing)
## DMDBORN4_Oth < 0.5241397 to the left, improve=0.005611986, (0 missing)
## BMXARMC < 1.085755 to the right, improve=0.004611308, (0 missing)
## Surrogate splits:
## BMXBMI < -1.613299 to the left, agree=0.718, adj=0.009, (0 split)
## BMXARMC < -1.741519 to the left, agree=0.718, adj=0.009, (0 split)
## BMXLEG < 2.27452 to the right, agree=0.717, adj=0.006, (0 split)
## BMXARML < 2.817162 to the right, agree=0.717, adj=0.006, (0 split)
##
## Node number 10: 146 observations
## mean=90.37671, MSE=687.0841
##
## Node number 11: 416 observations
## mean=103.2091, MSE=884.1366
##
## Node number 12: 495 observations, complexity param=0.004824224
## mean=97.02222, MSE=1260.83
## left son=24 (257 obs) right son=25 (238 obs)
## Primary splits:
## RIDAGEYR < 1.177549 to the right, improve=0.03593447, (0 missing)
## AIALANGA_NA < 0.7694033 to the right, improve=0.03260307, (0 missing)
## BMXLEG < 1.59693 to the left, improve=0.02390759, (0 missing)
## BMXBMI < -0.2837618 to the right, improve=0.02065733, (0 missing)
## BMXARMC < 1.318379 to the right, improve=0.01047926, (0 missing)
## Surrogate splits:
## AIALANGA_NA < 0.7694033 to the right, agree=0.970, adj=0.937, (0 split)
## RIDRETH3_White < 0.3645385 to the right, agree=0.586, adj=0.139, (0 split)
## BMXARMC < 0.2268366 to the left, agree=0.578, adj=0.122, (0 split)
## RIDRETH3_Black < 0.5542921 to the left, agree=0.570, adj=0.105, (0 split)
## AIALANGA_English < -2.469158 to the right, agree=0.570, adj=0.105, (0 split)
##
## Node number 13: 473 observations, complexity param=0.00377463
## mean=108.7146, MSE=1509.045
## left son=26 (156 obs) right son=27 (317 obs)
## Primary splits:
## BMXBMI < 0.3964963 to the right, improve=0.02458421, (0 missing)
## INDFMPIR < -0.05039062 to the left, improve=0.02189095, (0 missing)
## BMXARMC < 1.19312 to the right, improve=0.01510029, (0 missing)
## BMXLEG < -2.520729 to the right, improve=0.01028420, (0 missing)
## BMXARML < 0.6187648 to the right, improve=0.01009494, (0 missing)
## Surrogate splits:
## BMXARMC < 0.2304154 to the right, agree=0.854, adj=0.558, (0 split)
## BMXARML < 0.7226261 to the right, agree=0.706, adj=0.109, (0 split)
## INDFMPIR < -1.308407 to the left, agree=0.672, adj=0.006, (0 split)
## BMXHT < 0.6610774 to the right, agree=0.672, adj=0.006, (0 split)
##
## Node number 14: 464 observations, complexity param=0.004972343
## mean=109.2349, MSE=1026.456
## left son=28 (91 obs) right son=29 (373 obs)
## Primary splits:
## BMXBMI < -0.6839137 to the left, improve=0.04853424, (0 missing)
## RIAGENDR_Male < 0.02868757 to the left, improve=0.03776510, (0 missing)
## BMXARMC < -0.6499762 to the left, improve=0.03739263, (0 missing)
## BMXLEG < 0.03326233 to the left, improve=0.01273680, (0 missing)
## DMDBORN4_Oth < 0.5241397 to the left, improve=0.01123883, (0 missing)
## Surrogate splits:
## BMXARMC < -0.8110234 to the left, agree=0.894, adj=0.462, (0 split)
##
## Node number 15: 1166 observations, complexity param=0.00585521
## mean=117.6878, MSE=1326.074
## left son=30 (265 obs) right son=31 (901 obs)
## Primary splits:
## BMXBMI < 0.7230719 to the right, improve=0.019870600, (0 missing)
## RIDAGEYR < 0.6915823 to the right, improve=0.007605915, (0 missing)
## BMXARMC < 1.085755 to the right, improve=0.006445164, (0 missing)
## BMXLEG < -1.843139 to the right, improve=0.005717149, (0 missing)
## BMXARML < 0.5841444 to the right, improve=0.004316487, (0 missing)
## Surrogate splits:
## BMXARMC < 1.085755 to the right, agree=0.886, adj=0.498, (0 split)
## BMXLEG < -2.416484 to the left, agree=0.776, adj=0.015, (0 split)
## BMXARML < 2.869092 to the right, agree=0.774, adj=0.004, (0 split)
##
## Node number 24: 257 observations
## mean=90.54475, MSE=899.7889
##
## Node number 25: 238 observations, complexity param=0.003021559
## mean=104.0168, MSE=1556.462
## left son=50 (228 obs) right son=51 (10 obs)
## Primary splits:
## BMXLEG < 1.570869 to the left, improve=0.03791935, (0 missing)
## DMDEDUC2_9-11 < 1.224965 to the left, improve=0.03211565, (0 missing)
## BMXBMI < -0.3483024 to the right, improve=0.02225825, (0 missing)
## BMXHT < 1.478165 to the left, improve=0.02098695, (0 missing)
## DMDMARTZ_Nev < 0.7669317 to the right, improve=0.01875647, (0 missing)
## Surrogate splits:
## BMXHT < 2.235466 to the left, agree=0.966, adj=0.2, (0 split)
##
## Node number 26: 156 observations
## mean=100.0321, MSE=1199.569
##
## Node number 27: 317 observations
## mean=112.9874, MSE=1605.987
##
## Node number 28: 91 observations
## mean=94.94505, MSE=776.6014
##
## Node number 29: 373 observations, complexity param=0.003384505
## mean=112.7212, MSE=1025.44
## left son=58 (215 obs) right son=59 (158 obs)
## Primary splits:
## RIAGENDR_Male < 0.02868757 to the left, improve=0.04113595, (0 missing)
## BMXLEG < 0.03326233 to the left, improve=0.01961291, (0 missing)
## BMXBMI < 2.071971 to the right, improve=0.01668855, (0 missing)
## DMDBORN4_Oth < 0.5241397 to the left, improve=0.01517982, (0 missing)
## BMXHT < -0.0165078 to the left, improve=0.01417552, (0 missing)
## Surrogate splits:
## BMXHT < 0.5016456 to the left, agree=0.831, adj=0.601, (0 split)
## BMXARML < 0.2033197 to the left, agree=0.769, adj=0.456, (0 split)
## BMXLEG < 0.2938737 to the left, agree=0.753, adj=0.418, (0 split)
## INDFMPIR < -0.1527586 to the left, agree=0.619, adj=0.101, (0 split)
## BMXARMC < -0.0952579 to the left, agree=0.590, adj=0.032, (0 split)
##
## Node number 30: 265 observations
## mean=108.2226, MSE=1089.109
##
## Node number 31: 901 observations, complexity param=0.004142728
## mean=120.4717, MSE=1361.67
## left son=62 (867 obs) right son=63 (34 obs)
## Primary splits:
## BMXLEG < -1.843139 to the right, improve=0.014884930, (0 missing)
## BMXARMC < 1.211014 to the left, improve=0.009461288, (0 missing)
## RIDRETH3_Black < 0.5542921 to the right, improve=0.008043446, (0 missing)
## INDFMPIR < -0.7231829 to the left, improve=0.007061246, (0 missing)
## BMXBMI < -0.2708537 to the left, improve=0.006776726, (0 missing)
## Surrogate splits:
## BMXHT < -1.954601 to the right, agree=0.971, adj=0.235, (0 split)
## BMXARML < -2.323971 to the right, agree=0.966, adj=0.088, (0 split)
##
## Node number 50: 228 observations
## mean=102.4079, MSE=1440.917
##
## Node number 51: 10 observations
## mean=140.7, MSE=2786.21
##
## Node number 58: 215 observations
## mean=107.1535, MSE=854.0927
##
## Node number 59: 158 observations
## mean=120.2975, MSE=1159.019
##
## Node number 62: 867 observations, complexity param=0.00359949
## mean=119.5802, MSE=1272.986
## left son=124 (210 obs) right son=125 (657 obs)
## Primary splits:
## INDFMPIR < -0.7743669 to the left, improve=0.015161550, (0 missing)
## BMXARMC < 1.211014 to the left, improve=0.011285510, (0 missing)
## BMXBMI < -0.2708537 to the left, improve=0.008664953, (0 missing)
## RIDAGEYR < 0.6915823 to the right, improve=0.006674608, (0 missing)
## RIDRETH3_Black < 0.5542921 to the right, improve=0.006344337, (0 missing)
## Surrogate splits:
## BMXARMC < -2.036772 to the left, agree=0.760, adj=0.010, (0 split)
## BMXBMI < -1.548758 to the left, agree=0.759, adj=0.005, (0 split)
##
## Node number 63: 34 observations, complexity param=0.004142728
## mean=143.2059, MSE=3085.987
## left son=126 (27 obs) right son=127 (7 obs)
## Primary splits:
## BMXARML < -1.164187 to the left, improve=0.19305520, (0 missing)
## BMXHT < -1.112601 to the left, improve=0.16323750, (0 missing)
## INDFMPIR < -0.8033506 to the right, improve=0.11564990, (0 missing)
## RIDAGEYR < 0.1084221 to the left, improve=0.07737871, (0 missing)
## DMDEDUC2_AA < 0.3718316 to the right, improve=0.07447832, (0 missing)
## Surrogate splits:
## BMXHT < -1.022921 to the left, agree=0.912, adj=0.571, (0 split)
## BMXBMI < 0.4971797 to the left, agree=0.824, adj=0.143, (0 split)
##
## Node number 124: 210 observations
## mean=111.8095, MSE=1186.621
##
## Node number 125: 657 observations
## mean=122.0639, MSE=1275.122
##
## Node number 126: 27 observations
## mean=130.7778, MSE=1924.099
##
## Node number 127: 7 observations
## mean=191.1429, MSE=4673.837
# Node labels run up to 127 (16 terminal leaves); means and MSE of the terminal nodes vary widely
rpart.plot(Chol_rpart)
# tune and predict
# FIX: cp and maxdepth are caret *tuning* parameters. Passed through `...`,
# they are overridden by the values caret itself supplies via rpart.control
# (rpart's `control` argument takes precedence over `...`), so they never took
# effect. Supplying them through tuneGrid makes the intended values actually
# apply. Seeding before each train() matches the rest of the script.
set.seed(123)
Chol_rpart_tune <- train(x = Chol_tr_X_imp_fin, y = Chol_train_y, method = "rpart",
                         tuneGrid = data.frame(cp = 0.003))
Chol_rpart_pred <- predict(Chol_rpart_tune, Chol_test_X_imp)
postResample(pred = Chol_rpart_pred, obs = Chol_test_y)
## RMSE Rsquared MAE
## 33.0113573 0.0659519 25.8550917
# Rsquared value of 0.066 isn't too great. RMSE of 33.0. Let's compare to other trees
set.seed(123)
Chol_rpart2_tune <- train(x = Chol_tr_X_imp_fin, y = Chol_train_y, method = "rpart2",
                          tuneGrid = data.frame(maxdepth = 6))
Chol_rpart2_pred <- predict(Chol_rpart2_tune, Chol_test_X_imp)
postResample(pred = Chol_rpart2_pred, obs = Chol_test_y)
## RMSE Rsquared MAE
## 32.93397878 0.07117066 25.65328561
# minor improvement? Rsquared value of 0.071 and RMSE of 32.9
set.seed(123)
# Bagged CART with 70 bootstrap trees.
# NOTE(review): `cp` is not a caret tuning parameter for "treebag"; whether it
# actually reaches rpart through ipred::bagging's `...` should be confirmed.
# NOTE(review): the echoed resampling summary below shows fold sizes around 50
# against 3696 training rows -- verify Chol_control's index/number settings.
Chol_bagtree <- train(x=Chol_tr_X_imp_fin, y = Chol_train_y, method = "treebag", nbagg = 70, cp = 0.003, trControl = Chol_control)
Chol_bagtree
## Bagged CART
##
## 3696 samples
## 32 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 51, 50, 48, 50, 46, 46, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 35.31841 0.03281359 27.35183
# Rsquared value of 0.033. RMSE is 35.3. Still not fantastic
set.seed(123)
# Random forest: search mtry over 1..10 with a fixed forest of 300 trees.
rfmtryValues <- seq(1, 10, 1)
Chol_rf <- train(
  x = Chol_tr_X_imp_fin,
  y = Chol_train_y,
  method = "rf",
  # 300 trees provides the highest Rsquared value when checking with test data
  ntree = 300,
  tuneGrid = data.frame(mtry = rfmtryValues),
  trControl = Chol_control
)
invisible(Chol_rf) # output hidden to reduce clutter
plot(Chol_rf)
Chol_rf_pred <- predict(Chol_rf, Chol_test_X_imp)
postResample(pred = Chol_rf_pred, obs = Chol_test_y)
## RMSE Rsquared MAE
## 33.06977523 0.07717907 26.17119444
# Rsquared value of 0.077. RMSE of 33.1
# some control parameters
# Grid: 5 interaction depths x 2 learning rates; 300 trees, min node size 5.
gbmGrid <- expand.grid(
  interaction.depth = c(1, 3, 5, 7, 9),
  n.trees = 300,
  shrinkage = c(0.01, 0.1),
  n.minobsinnode = 5
)
set.seed(123)
Chol_gbm <- train(
  x = Chol_tr_X_imp_fin,
  y = Chol_train_y,
  method = "gbm",
  tuneGrid = gbmGrid,
  verbose = FALSE,
  trControl = Chol_control
)
invisible(Chol_gbm) # output hidden to reduce clutter. Key observations noted below:
# shrinkage of 0.01 and a smaller interaction depth provided models with the lowest RMSE values
Chol_gbm_pred <- predict(Chol_gbm, Chol_test_X_imp)
postResample(pred = Chol_gbm_pred, obs = Chol_test_y)
## RMSE Rsquared MAE
## 32.86446826 0.08137564 25.82653202
# RMSE 32.9 and Rsquared 0.081
# decision tree with linear regression at terminal nodes to predict continuous variables
set.seed(123)
Chol_M5 <- train(
  x = Chol_tr_X_imp_fin,
  y = Chol_train_y,
  method = "M5",
  trControl = Chol_control,
  # Weka M option: minimum number of instances per leaf
  control = Weka_control(M=10)
)
plot(Chol_M5)
Chol_M5_pred <- predict(Chol_M5, Chol_test_X_imp)
postResample(pred = Chol_M5_pred, obs = Chol_test_y)
## RMSE Rsquared MAE
## 33.61012224 0.07641278 26.16774336
# Rsquared value of 0.076 and RMSE of 33.6
# decision tree with linear regression at terminal nodes to predict continuous variables
# Rule-based variant of M5 (rules instead of a tree).
set.seed(123)
Chol_M5rules <- train(
  x = Chol_tr_X_imp_fin,
  y = Chol_train_y,
  method = "M5Rules",
  trControl = Chol_control,
  # Weka M option: minimum number of instances per leaf
  control = Weka_control(M=10)
)
plot(Chol_M5rules)
Chol_M5rules_pred <- predict(Chol_M5rules, Chol_test_X_imp)
postResample(pred = Chol_M5rules_pred, obs = Chol_test_y)
## RMSE Rsquared MAE
## 32.75281564 0.08668215 25.26610200
# Slight improvement, Rsquared value of 0.087 and RMSE of 32.8
set.seed(123)
# Cubist: committee/neighbor tuning left to caret's default grid.
Chol_cube <- train(
  x = Chol_tr_X_imp_fin,
  y = Chol_train_y,
  method = "cubist",
  trControl = Chol_control
)
plot(Chol_cube)
Chol_cube_pred <- predict(Chol_cube, Chol_test_X_imp)
postResample(pred = Chol_cube_pred, obs = Chol_test_y)
## RMSE Rsquared MAE
## 32.63815970 0.09218122 25.11596950
# "Best performing" so far, but ever so slightly over the other models. Rsquared of 0.092 and RMSE of 32.6
# Assemble every model's test-set predictions next to the observed values.
NonLpred <- NonLpred %>% select(!obs)  # drop duplicate observed column before binding
Trees <- cbind.data.frame(Cubist = Chol_cube_pred, GBM = Chol_gbm_pred, M5 = Chol_M5_pred, M5Rules = Chol_M5rules_pred, RF = Chol_rf_pred, CART = Chol_rpart2_pred)
Results <- cbind(Linear_res, NonLpred, Trees)
# RMSE of one prediction column against the observed LDL values.
# `obs` defaults to the Observed column for backward compatibility, but is
# passed explicitly below so the helper no longer silently depends on the
# global Results object.
find_rmse <- function(x, obs = Results[["Observed"]]){
  caret::RMSE(x, obs)
}
# Score every model column (everything except "Observed") by name instead of
# relying on the hard-coded index range 2:22.
model_cols <- setdiff(names(Results), "Observed")
RMSE_results <- vapply(Results[model_cols], find_rmse, numeric(1), obs = Results[["Observed"]])
RMSE_results <- data.frame(RMSE_results)
RMSE_results$Model <- rownames(RMSE_results)
# Model-family labels rely on the binding order above: linear models first
# (from Linear_res), then 5 non-linear (NonLpred), then the 6 tree models.
RMSE_results$Model_Type <- "Linear"
RMSE_results$Model_Type[11:15] <- "Non-Linear"
RMSE_results$Model_Type[16:21] <- "Tree"
# Lollipop chart of RMSE per model, colored by model family.
ggplot(RMSE_results, aes(x=reorder(Model, -RMSE_results), y=RMSE_results)) + geom_segment(aes(x=reorder(Model, -RMSE_results), xend = reorder(Model, -RMSE_results), y=30, yend=RMSE_results, color = Model_Type)) + geom_point(aes(color=Model_Type), size = 9) + coord_flip() + ylab("RMSE") + geom_text(aes(label = round(RMSE_results, 2)), color = "black", size = 2.5, fontface = "bold") + labs(color = "Model Type") + xlab("Model")
# Plot observed LDL alongside Cubist predictions across the test set.
cubist_plot <- Results %>% select(Observed, Cubist)
# index column derived from the data instead of the hard-coded test-set size (921)
cubist_plot$x <- seq_len(nrow(cubist_plot))
# long format: one row per (index, data source); pivot_longer supersedes gather
cubist_plot <- cubist_plot %>%
  pivot_longer(cols = -x, names_to = "Data Source", values_to = "LDL")
# cubist_plot$alpha <- ifelse(cubist_plot$Observed == "Observed", 0.8, 1)
ggplot(cubist_plot, aes(x = x, y = LDL)) +
  geom_line(aes(color = `Data Source`, alpha = `Data Source`)) +
  scale_color_manual(values = c("royalblue1", "royalblue4")) +
  scale_alpha_manual(values = c(1, .3)) +
  theme_classic()